{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## HashingTF and CountVectorizer\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "twitter = spark.createDataFrame([\n",
    "                                ('Wenqiang is a spark expert', 'Wenqiang', 1.0),\n",
    "                                ('Ming is learning spark', 'Ming', 0.0)],\n",
    "                                ['text', 'id', 'label']\n",
    "                               )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------+-----+\n",
      "|                text|      id|label|\n",
      "+--------------------+--------+-----+\n",
      "|Wenqiang is a spa...|Wenqiang|  1.0|\n",
      "|Ming is learning ...|    Ming|  0.0|\n",
      "+--------------------+--------+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "twitter.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------+-----+--------------------+\n",
      "|                text|      id|label|              tokens|\n",
      "+--------------------+--------+-----+--------------------+\n",
      "|Wenqiang is a spa...|Wenqiang|  1.0|[wenqiang, is, a,...|\n",
      "|Ming is learning ...|    Ming|  0.0|[ming, is, learni...|\n",
      "+--------------------+--------+-----+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tokenizer_mod = Tokenizer(inputCol='text', outputCol='tokens')\n",
    "twitter_tokens = tokenizer_mod.transform(twitter)\n",
    "twitter_tokens.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### HashingTF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import HashingTF\n",
    "hashingTF_mod = HashingTF(numFeatures=pow(2,4), inputCol='tokens', outputCol='features')\n",
    "hashingTF_twitter = hashingTF_mod.transform(twitter_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------------+--------+-----+--------------------------------+---------------------------------+\n",
      "|text                      |id      |label|tokens                          |features                         |\n",
      "+--------------------------+--------+-----+--------------------------------+---------------------------------+\n",
      "|Wenqiang is a spark expert|Wenqiang|1.0  |[wenqiang, is, a, spark, expert]|(16,[1,2,9,13],[2.0,1.0,1.0,1.0])|\n",
      "|Ming is learning spark    |Ming    |0.0  |[ming, is, learning, spark]     |(16,[0,1,14],[1.0,2.0,1.0])      |\n",
      "+--------------------------+--------+-----+--------------------------------+---------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "hashingTF_twitter.show(truncate=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import CountVectorizer\n",
    "count_vectorizer = CountVectorizer(vocabSize=pow(2,4), inputCol='tokens', outputCol='features')\n",
    "countVectorizer_mod = count_vectorizer.fit(twitter_tokens)\n",
    "countVectorizer_twitter = countVectorizer_mod.transform(twitter_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------------+--------+-----+--------------------------------+-------------------------------------+\n",
      "|text                      |id      |label|tokens                          |features                             |\n",
      "+--------------------------+--------+-----+--------------------------------+-------------------------------------+\n",
      "|Wenqiang is a spark expert|Wenqiang|1.0  |[wenqiang, is, a, spark, expert]|(7,[0,1,2,3,5],[1.0,1.0,1.0,1.0,1.0])|\n",
      "|Ming is learning spark    |Ming    |0.0  |[ming, is, learning, spark]     |(7,[0,1,4,6],[1.0,1.0,1.0,1.0])      |\n",
      "+--------------------------+--------+-----+--------------------------------+-------------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "countVectorizer_twitter.show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [Root]",
   "language": "python",
   "name": "Python [Root]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
